Jean-Baptiste Lecomte
Forget about R base graphics: \texttt{plot(), hist(), par(), layout(), points(), lines(), legend()}
ggplot2 is based on a system of layers, which can be stored and reused as objects. Main layers
ggplot2 is based on two functions:
qplot() for quick plot
easy and fast, but too simple in most cases: qplot(x, y, data=data)
ggplot()
ggplot(data=data, aes(x, y)) + layers
Always work with a data.frame
Our data frame is based on the Hecate Strait Synoptic Trawl Survey and simulated data.
Download all the code from the GitHub repository: https://github.com/JBLecomte/ggplot2-Introduction.git
## 'data.frame': 1909 obs. of 18 variables:
## $ Year : int 2005 2005 2005 2005 2005 2005 2005 2005 2005 2005 ...
## $ Month : int 7 7 7 7 7 7 7 7 7 7 ...
## $ DURATION_MINUTES: int 21 20 21 21 20 20 20 21 21 20 ...
## $ AREA : Factor w/ 2 levels "5AB","5CD": 1 1 1 1 1 1 1 1 1 1 ...
## $ Avg_net_depth : num -0.316 -0.435 -0.442 -0.234 -0.171 ...
## $ Avg_net_temp : num 0.3939 0.4339 0.3004 0.1335 -0.0267 ...
## $ Date : Date, format: "2005-07-06" "2005-07-06" ...
## $ Lon : num -128 -128 -128 -128 -128 ...
## $ Lat : num 51.2 51.1 51.6 51.6 51.7 ...
## $ X : num 572025 570307 553665 551917 546338 ...
## $ Y : num 5668122 5665874 5717947 5719597 5723992 ...
## $ X_km : num 572 570 554 552 546 ...
## $ Y_km : num 5668 5666 5718 5720 5724 ...
## $ Pres : num 1 1 1 1 1 1 1 0 0 1 ...
## $ Year_fac : Factor w/ 5 levels "2005","2007",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ AREA_num : num 1 1 1 1 1 1 1 1 1 1 ...
## $ nFish : int 2 4 3 3 3 3 0 0 3 3 ...
## $ Biomass : num 4.87 8.37 7.51 7.49 6.87 ...
# Scatter plot of biomass against average net depth, coloured by area.
sp_color <- ggplot(df_data, aes(x = Avg_net_depth, y = Biomass,
                                color = AREA)) +
  geom_point(size = 1)
print(sp_color)

# Same scatter plot, with point shape additionally mapped to the year factor.
sp_shape <- ggplot(df_data, aes(x = Avg_net_depth, y = Biomass,
                                color = AREA, shape = Year_fac)) +
  geom_point(size = 1)
print(sp_shape)

# Colour mapped to a numeric variable (Avg_net_temp) instead of a factor.
sp_color_cont <- ggplot(df_data, aes(x = Avg_net_depth, y = Biomass,
                                     color = Avg_net_temp)) +
  geom_point(size = 1)
print(sp_color_cont)

# Point size mapped to nFish; shape 17 is set for every point (not mapped).
sp_area <- ggplot(df_data, aes(x = Avg_net_depth, y = Biomass,
                               size = nFish)) +
  geom_point(shape = 17)
print(sp_area)

# Base plot for the axis and legend adjustments that follow
# (identical to the sp_shape built above).
sp_shape <- ggplot(df_data, aes(x = Avg_net_depth, y = Biomass,
                                color = AREA, shape = Year_fac)) +
  geom_point(size = 1)
print(sp_shape)

# Replace the default axis titles.
sp_shape_imp1 <- sp_shape +
  xlab('Average net depth (m)') +
  ylab('Biomass (kg)')
print(sp_shape_imp1)

# Continuous axis scales, argument overview (reference only, not runnable):
#   scale_x_continuous(name, breaks, labels, limits, trans)
#   scale_y_continuous(name, breaks, labels, limits, trans)

# Set the x-axis title, tick positions and limits in a single scale call.
sp_shape_imp1 <- sp_shape_imp1 +
  scale_x_continuous(name = 'Average net depth (m)',
                     breaks = seq(-4, 7, 2), limits = c(-4, 7))
print(sp_shape_imp1)

# Rename the legend titles of the shape and colour scales.
sp_shape_imp2 <- ggplot(df_data, aes(x = Avg_net_depth, y = Biomass,
                                     color = AREA, shape = Year_fac)) +
  geom_point(size = 1) +
  xlab('Average net depth (m)') +
  scale_shape_discrete(name = "Years") +
  scale_color_discrete(name = "Area")
print(sp_shape_imp2)

# Add a plot title.
sp_shape_imp3 <- sp_shape_imp2 +
  ggtitle("Biomass of species X and average net depth")
print(sp_shape_imp3)

# Base plot for the colour-scale examples: colour by year, slightly
# transparent points.
sp_c <- ggplot(df_data, aes(x = Avg_net_depth, y = Biomass,
                            color = Year_fac)) +
  geom_point(alpha = 0.8, size = 1) +
  xlab('Average net depth (m)')
print(sp_c)

# Adjust luminosity and chroma
sp_c_hue <- sp_c +
  scale_colour_hue(name = 'Year', l = 70, c = 150)
print(sp_c_hue)

# Adjust luminosity and chroma
sp_c_hue <- sp_c +
  scale_colour_hue(name = 'Year', l = 10, c = 150)
print(sp_c_hue)

# Adjust luminosity and chroma
sp_c_hue <- sp_c +
  scale_colour_hue(name = 'Year', l = 70, c = 50)
print(sp_c_hue)

# Change range of hues used
sp_c_hue <- sp_c +
  scale_colour_hue(name = 'Year', h = c(0, 90))
print(sp_c_hue)

# Change range of hues used
sp_c_hue <- sp_c +
  scale_colour_hue(name = 'Year', h = c(100, 300))
print(sp_c_hue)

# Other resources:
http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
http://research.stowers-institute.org/efg/R/Color/Chart/ColorChart.pdf
# Discrete colours supplied by wes_palette().
sp_c_wanderson <- sp_c +
  scale_color_manual(name = 'Year', values = wes_palette(name = "Darjeeling1"))
print(sp_c_wanderson)

# Hand-picked hex colours for the area scale of sp_color.
sp_c_manual <- sp_color +
  scale_color_manual(name = 'Area', values = c("#E69F00", "#56B4E9"))
print(sp_c_manual)

# A color-blind palette:
# Eight hex colours; the last entry is black.
cb_palette_black <- c("#E69F00", "#56B4E9", "#009E73", "#F0E442",
                      "#0072B2", "#D55E00", "#CC79A7", "#000000")

# Or create your own gradient palette function with 2 or more colors:
# Discrete colours produced by a user-defined palette function.
# NOTE(review): Palette_grb and nYear are not defined in the visible part of
# this file -- confirm they are created before this chunk runs.
sp_color_own <- sp_c +
  scale_color_manual(name = 'Year', values = Palette_grb(nYear))
print(sp_color_own)

# Depth against temperature, with colour mapped to the numeric Biomass.
sp_cc <- ggplot(df_data, aes(x = Avg_net_temp, y = Avg_net_depth)) +
  geom_point(alpha = 0.8, size = 1, aes(colour = Biomass)) +
  xlab('Average net temperature (C)') +
  ylab('Average net depth (m)')
print(sp_cc)

# Diverging gradient: blue -> muted green -> red, centred on Biomass = 100.
sp_cc_div <- sp_cc +
  scale_colour_gradient2(midpoint = 100,
                         low = 'blue', high = 'red', mid = muted('green'))
print(sp_cc_div)

# n-colour gradient built from a continuous wes_palette() of 5 colours.
sp_cc_own <- sp_cc +
  scale_color_gradientn(colors = wes_palette(n = 5,
                                             name = "Rushmore",
                                             type = 'continuous'))
print(sp_cc_own)

# Raw biomass observations through time; y axis runs from 0 to the maximum.
TSplot <- ggplot(data = df_data, aes(x = Date, y = Biomass)) +
  geom_point() +
  ylim(0, max(df_data$Biomass))
print(TSplot)

# Summarize the data with the mean, median and quantiles for easy plotting.
# The ddply function from the plyr R package provides an easy way to
# summarize the data.
# One row per Year with the mean, median, standard deviation and a set of
# quantiles of Biomass (2.5%, 5%, 10%, 50%, 90%, 95%, 97.5%).
df_summary <- ddply(df_data, c('Year'), summarise,
                    B_mean = mean(Biomass),
                    B_median = median(Biomass),
                    B_sd = sd(Biomass),
                    B_q025 = quantile(Biomass, probs = 0.025),
                    B_q05 = quantile(Biomass, probs = 0.05),
                    B_q10 = quantile(Biomass, probs = 0.10),
                    B_q50 = quantile(Biomass, probs = 0.50),
                    B_q90 = quantile(Biomass, probs = 0.90),
                    B_q95 = quantile(Biomass, probs = 0.95),
                    B_q975 = quantile(Biomass, probs = 0.975))

# Summarize the data with the mean, median and quantiles for easy plotting.
# The ddply function from the plyr R package provides an easy way to
# summarize the data.
## Classes 'tbl_df', 'tbl' and 'data.frame': 5 obs. of 11 variables:
## $ Year : int 2005 2007 2009 2011 2013
## $ B_mean : num 94.2 70.6 90.3 113.9 133.8
## $ B_median: num 30.2 24.5 38.6 43.7 42.3
## $ B_sd : num 156 117 151 187 223
## $ B_q025 : num 0 0 0 0 0
## $ B_q05 : num 0 0 0 0 0.0435
## $ B_q10 : num 1.06 1.08 2.49 2.9 2.69
## $ B_q50 : num 30.2 24.5 38.6 43.7 42.3
## $ B_q90 : num 319 208 245 337 384
## $ B_q95 : num 449 340 428 493 650
## $ B_q975 : num 593 467 609 677 886
# Yearly mean biomass as a line, with error bars spanning the 2.5%-97.5%
# quantiles.
TSplot_errori95 <- ggplot(data = df_summary, aes(x = Year, y = B_mean)) +
  geom_line() +
  geom_errorbar(aes(ymin = B_q025, ymax = B_q975), width = 0.2)
print(TSplot_errori95)

# Same interval drawn with point ranges.
TSplot_i95 <- ggplot(data = df_summary, aes(x = Year, y = B_mean)) +
  geom_point() +
  geom_pointrange(aes(ymin = B_q025, ymax = B_q975))
print(TSplot_i95)

# Point ranges with an axis title and one x-axis break per year.
TSplot_i95 <- ggplot(data = df_summary, aes(x = Year, y = B_mean)) +
  geom_point() +
  geom_pointrange(aes(ymin = B_q025, ymax = B_q975)) +
  ylab('Biomass') +
  scale_x_continuous(name = 'Year', breaks = seq(2005, 2013, by = 1))
print(TSplot_i95)

# Interval drawn as a semi-transparent ribbon behind the points and line.
TSplot_ri95 <- ggplot(data = df_summary, aes(x = Year, y = B_mean)) +
  geom_ribbon(aes(ymin = B_q025, ymax = B_q975), fill = "blue", alpha = 0.2) +
  geom_point() +
  geom_line() +
  ylab('Biomass') +
  scale_x_continuous(name = 'Year', breaks = seq(2005, 2013, by = 1))
print(TSplot_ri95)

# Raw observations against Date; base plot for the date-axis examples.
TSplot <- ggplot(data = df_data, aes(x = Date, y = Biomass)) +
  geom_point() +
  ylab('Biomass (kg)')
print(TSplot)

# Format : two-digit year/abbreviated month; one labelled break per year,
# minor breaks every month, labels rotated by 45 degrees
TSplotm <- TSplot +
  scale_x_date(labels = date_format("%y/%b"), date_breaks = "1 year",
               date_minor_breaks = "1 month") +
  theme(axis.text.x = element_text(angle = 45))
print(TSplotm)

# Format : Year/month
TSplotym <- TSplot +
  scale_x_date(labels = date_format("%Y/%m"))
print(TSplotym)

# Format : Year/month/day, x axis restricted to 2011-01-01 .. latest date
lmin <- as.Date("2011-1-1")
lmax <- max(df_data$Date)
TSplotymdl <- TSplot +
  scale_x_date(labels = date_format("%y/%m/%d"), limits = c(lmin, lmax))
print(TSplotymdl)

## Warning: Continuous x aesthetic -- did you forget aes(group=...)?
# One boxplot of biomass per year; Year is converted to a factor so that x is
# discrete.
boxplot_TSf <- ggplot(data = df_data, aes(x = factor(Year), y = Biomass)) +
  geom_boxplot()
print(boxplot_TSf)

# Boxplots per year, coloured by area.
boxplot_TS_AREA <- ggplot(data = df_data, aes(x = factor(Year), y = Biomass,
                                              colour = AREA)) +
  geom_boxplot()
print(boxplot_TS_AREA)

# Drop the x-axis title and label only three of the years.
boxplot_TS_AREA <- boxplot_TS_AREA +
  scale_x_discrete(name = NULL,
                   breaks = c("2005", "2009", "2013"),
                   labels = c("Year-2005", "Year-2009", "Year-2013"))
print(boxplot_TS_AREA)

# The easiest way to change the labels in the legend (i.e. 5AB and 5CD) is to
# rename the factor levels before the plot:
# Relabel the two AREA levels; `levels` fixes their order, `labels` supplies
# the text displayed for each.
df_data$AREA <- factor(df_data$AREA, levels = c("5AB", "5CD"),
                       labels = c("Northern area", "Southern area"))

# This code can also be useful for reordering the factor levels before the
# plot:
# Boxplots per year coloured by area, with custom x labels, colours taken
# from cb_palette_black and the legend placed inside the panel.
bplot_TS <- ggplot(df_data, aes(x = factor(Year), y = Biomass,
                                colour = AREA)) +
  geom_boxplot() +
  scale_x_discrete(name = NULL,
                   breaks = c("2005", "2009", "2013"),
                   labels = c("Year-2005", "Year-2009", "Year-2013")) +
  scale_color_manual(name = 'Area', values = cb_palette_black) +
  theme(legend.position = c(0.2, 0.9))
print(bplot_TS)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of tow duration, one panel per area.
fw1 <- ggplot(data = df_data, aes(x = DURATION_MINUTES)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~ AREA)
print(fw1)

# Both axes free to vary between panels.
fw1_free <- ggplot(data = df_data, aes(x = DURATION_MINUTES)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~ AREA, scales = 'free')
print(fw1_free)

# Only the y axis free to vary between panels.
fw1_free_y <- ggplot(data = df_data, aes(x = DURATION_MINUTES)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~ AREA, scales = 'free_y')
print(fw1_free_y)

# Panels stacked in one column and two rows, with shared (fixed) scales.
fw1_col <- ggplot(data = df_data, aes(x = DURATION_MINUTES)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~ AREA, ncol = 1, nrow = 2, scales = 'fixed')
print(fw1_col)

# Wrap on two variables: area first, then year.
fw2 <- ggplot(data = df_data, aes(x = DURATION_MINUTES)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~ AREA + Year_fac)
print(fw2)

# Same two variables in the opposite order: year first, then area.
fw2_2 <- ggplot(data = df_data, aes(x = DURATION_MINUTES)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~ Year_fac + AREA)
print(fw2_2)

# facet_grid with years laid out as columns.
fg1_1 <- ggplot(df_data, aes(x = Avg_net_depth, y = nFish, color = AREA)) +
  geom_point(shape = 20) +
  facet_grid(. ~ Year)
print(fg1_1)

# facet_grid with years laid out as rows.
fg1_2 <- ggplot(df_data, aes(x = Avg_net_depth, y = nFish, color = AREA)) +
  geom_point(shape = 20) +
  facet_grid(Year ~ .)
print(fg1_2)

# Years as rows, with margins = TRUE.
fg1_3 <- ggplot(df_data, aes(x = Avg_net_depth, y = nFish, color = AREA)) +
  geom_point(shape = 20) +
  facet_grid(Year ~ ., margins = TRUE)
print(fg1_3)

# Year and area combined on the column side of the grid.
fg2_1 <- ggplot(df_data, aes(x = Avg_net_depth, y = nFish)) +
  geom_point(shape = 20) +
  facet_grid(~ Year + AREA)
print(fg2_1)

# Areas as rows, years as columns.
fg2_2 <- ggplot(df_data, aes(x = Avg_net_depth, y = nFish)) +
  geom_point(shape = 20) +
  facet_grid(AREA ~ Year)
print(fg2_2)

# Same grid with free scales and free panel sizes.
fg2_3 <- ggplot(df_data, aes(x = Avg_net_depth, y = nFish)) +
  geom_point(shape = 20, size = 1) +
  facet_grid(AREA ~ Year, scales = 'free', space = 'free')
print(fg2_3)

# Replace the factor names manually:
Or write a function:
# Year facets relabelled through a custom labeller function.
# NOTE(review): fn_alphabetic_label is not defined in the visible part of
# this file -- confirm it is created before this chunk runs.
fg1_3a <- ggplot(df_data, aes(x = Avg_net_depth, y = nFish, color = AREA)) +
  geom_point(shape = 20, size = 1) +
  facet_grid(Year ~ ., labeller = labeller(Year = fn_alphabetic_label))
print(fg1_3a)

# Restyle the facet strips: text size/angle/face and the strip outline.
fg2_3b <- fg2_2 +
  theme(strip.text.x = element_text(size = 8, angle = 45),
        strip.text.y = element_text(size = 12, face = "bold"),
        strip.background = element_rect(colour = "red", fill = NA))
print(fg2_3b)

# It can be hard to plot on the same page different plots.
For example, it is not possible to plot a boxplot, a scatter plot, and a bar plot on the same page.
## Loading required package: grid
Choose the layout to
# Modify the active theme for every plot drawn afterwards.
# Each theme element is passed exactly once: theme() has these elements as
# formal arguments, so repeating a name (the original call repeated
# panel.background, panel.grid.minor, strip.background and legend.key) fails
# with "formal argument matched by multiple actual arguments".
# NOTE(review): where the two settings differed, the later one was kept
# (transparent panel background, blank minor grid, unfilled legend keys) --
# confirm this is the intended look.
theme_update(
  axis.text = element_text(size = rel(0.8)),
  axis.ticks = element_line(colour = "black", size = 1),
  axis.line = element_line(),
  panel.border = element_rect(fill = NA, colour = NA),
  panel.grid.major = element_line(colour = "grey90", size = 0.2),
  strip.background = element_rect(fill = "grey80", colour = "grey50"),
  panel.background = element_rect(fill = "transparent", colour = NA),
  panel.grid.minor = element_blank(),
  plot.background = element_rect(fill = "transparent", colour = NA),
  legend.background = element_rect(colour = 'black', fill = NA),
  legend.key = element_rect(colour = NA, fill = NA),
  legend.key.size = unit(1.1, 'line'),
  legend.title = element_text(size = rel(1), face = 'bold'),
  legend.text = element_text(size = rel(1))
)

# Move the legends below the plot and lay them out side by side.
sp_legend <- sp_shape_imp2 +
  theme(legend.position = "bottom",
        legend.box = "horizontal")
print(sp_legend)

# Give the legend a filled, outlined background.
sp_legend2 <- sp_legend +
  theme(legend.background = element_rect(fill = "lightblue",
                                         size = 0.5, linetype = "solid",
                                         colour = "darkblue"))
print(sp_legend2)

# Fix the legend order: shape first, colour second.
sp_legend2 <- sp_legend2 +
  guides(shape = guide_legend(order = 1),
         color = guide_legend(order = 2))
print(sp_legend2)

# The ggsave function from the ggplot2 package allows a plot to be saved
# automatically.
- A website that lists ggplot2 extensions: https://ggplot2-exts.github.io/index.html
- ggfortify and its autoplot() function allow plotting the output of some popular R packages using a standardized approach: diagnostic plots for Generalized Linear Models (GLM), plots of Principal Component Analysis, …
- ggmcmc was developed to plot MCMC outputs from OpenBUGS, JAGS or Stan. It provides plots to assess the convergence and general behaviour of MCMC chains.
- GGally regroups R packages which extend ggplot2. The main reason to use GGally is its correlation plots.
- latex2exp allows labels and titles in a plot to be replaced with LaTeX expressions.